import opendatasets as od
import pandas as pd
from pandas_profiling import ProfileReport
import matplotlib.pyplot as plt
import squarify
import plotly.express as px
from ipywidgets import interact
import ipywidgets as widgets
from IPython.core.display import HTML
import phik
from phik.report import plot_correlation_matrix
from phik import report
from fitter import Fitter, get_common_distributions, get_distributions
# Data collection.
# Downloading from Kaggle requires a registered account and an API token
# (Kaggle -> Account -> Create New API Token).
dataset_url = "https://www.kaggle.com/datasets/prevek18/ames-housing-dataset"
od.download(dataset_url)
Skipping, found downloaded files in ".\ames-housing-dataset" (use force=True to force download)
# The CSV is smaller than 1 MB, so it is read into memory in one go.
csv_path = 'ames-housing-dataset/AmesHousing.csv'
# Show every column when a frame is displayed instead of truncating the view.
pd.set_option('display.max_columns', None)
data = pd.read_csv(csv_path)
data.head()
| Order | PID | MS SubClass | MS Zoning | Lot Frontage | Lot Area | Street | Alley | Lot Shape | Land Contour | Utilities | Lot Config | Land Slope | Neighborhood | Condition 1 | Condition 2 | Bldg Type | House Style | Overall Qual | Overall Cond | Year Built | Year Remod/Add | Roof Style | Roof Matl | Exterior 1st | Exterior 2nd | Mas Vnr Type | Mas Vnr Area | Exter Qual | Exter Cond | Foundation | Bsmt Qual | Bsmt Cond | Bsmt Exposure | BsmtFin Type 1 | BsmtFin SF 1 | BsmtFin Type 2 | BsmtFin SF 2 | Bsmt Unf SF | Total Bsmt SF | Heating | Heating QC | Central Air | Electrical | 1st Flr SF | 2nd Flr SF | Low Qual Fin SF | Gr Liv Area | Bsmt Full Bath | Bsmt Half Bath | Full Bath | Half Bath | Bedroom AbvGr | Kitchen AbvGr | Kitchen Qual | TotRms AbvGrd | Functional | Fireplaces | Fireplace Qu | Garage Type | Garage Yr Blt | Garage Finish | Garage Cars | Garage Area | Garage Qual | Garage Cond | Paved Drive | Wood Deck SF | Open Porch SF | Enclosed Porch | 3Ssn Porch | Screen Porch | Pool Area | Pool QC | Fence | Misc Feature | Misc Val | Mo Sold | Yr Sold | Sale Type | Sale Condition | SalePrice | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 1 | 526301100 | 20 | RL | 141.0 | 31770 | Pave | NaN | IR1 | Lvl | AllPub | Corner | Gtl | NAmes | Norm | Norm | 1Fam | 1Story | 6 | 5 | 1960 | 1960 | Hip | CompShg | BrkFace | Plywood | Stone | 112.0 | TA | TA | CBlock | TA | Gd | Gd | BLQ | 639.0 | Unf | 0.0 | 441.0 | 1080.0 | GasA | Fa | Y | SBrkr | 1656 | 0 | 0 | 1656 | 1.0 | 0.0 | 1 | 0 | 3 | 1 | TA | 7 | Typ | 2 | Gd | Attchd | 1960.0 | Fin | 2.0 | 528.0 | TA | TA | P | 210 | 62 | 0 | 0 | 0 | 0 | NaN | NaN | NaN | 0 | 5 | 2010 | WD | Normal | 215000 |
| 1 | 2 | 526350040 | 20 | RH | 80.0 | 11622 | Pave | NaN | Reg | Lvl | AllPub | Inside | Gtl | NAmes | Feedr | Norm | 1Fam | 1Story | 5 | 6 | 1961 | 1961 | Gable | CompShg | VinylSd | VinylSd | None | 0.0 | TA | TA | CBlock | TA | TA | No | Rec | 468.0 | LwQ | 144.0 | 270.0 | 882.0 | GasA | TA | Y | SBrkr | 896 | 0 | 0 | 896 | 0.0 | 0.0 | 1 | 0 | 2 | 1 | TA | 5 | Typ | 0 | NaN | Attchd | 1961.0 | Unf | 1.0 | 730.0 | TA | TA | Y | 140 | 0 | 0 | 0 | 120 | 0 | NaN | MnPrv | NaN | 0 | 6 | 2010 | WD | Normal | 105000 |
| 2 | 3 | 526351010 | 20 | RL | 81.0 | 14267 | Pave | NaN | IR1 | Lvl | AllPub | Corner | Gtl | NAmes | Norm | Norm | 1Fam | 1Story | 6 | 6 | 1958 | 1958 | Hip | CompShg | Wd Sdng | Wd Sdng | BrkFace | 108.0 | TA | TA | CBlock | TA | TA | No | ALQ | 923.0 | Unf | 0.0 | 406.0 | 1329.0 | GasA | TA | Y | SBrkr | 1329 | 0 | 0 | 1329 | 0.0 | 0.0 | 1 | 1 | 3 | 1 | Gd | 6 | Typ | 0 | NaN | Attchd | 1958.0 | Unf | 1.0 | 312.0 | TA | TA | Y | 393 | 36 | 0 | 0 | 0 | 0 | NaN | NaN | Gar2 | 12500 | 6 | 2010 | WD | Normal | 172000 |
| 3 | 4 | 526353030 | 20 | RL | 93.0 | 11160 | Pave | NaN | Reg | Lvl | AllPub | Corner | Gtl | NAmes | Norm | Norm | 1Fam | 1Story | 7 | 5 | 1968 | 1968 | Hip | CompShg | BrkFace | BrkFace | None | 0.0 | Gd | TA | CBlock | TA | TA | No | ALQ | 1065.0 | Unf | 0.0 | 1045.0 | 2110.0 | GasA | Ex | Y | SBrkr | 2110 | 0 | 0 | 2110 | 1.0 | 0.0 | 2 | 1 | 3 | 1 | Ex | 8 | Typ | 2 | TA | Attchd | 1968.0 | Fin | 2.0 | 522.0 | TA | TA | Y | 0 | 0 | 0 | 0 | 0 | 0 | NaN | NaN | NaN | 0 | 4 | 2010 | WD | Normal | 244000 |
| 4 | 5 | 527105010 | 60 | RL | 74.0 | 13830 | Pave | NaN | IR1 | Lvl | AllPub | Inside | Gtl | Gilbert | Norm | Norm | 1Fam | 2Story | 5 | 5 | 1997 | 1998 | Gable | CompShg | VinylSd | VinylSd | None | 0.0 | TA | TA | PConc | Gd | TA | No | GLQ | 791.0 | Unf | 0.0 | 137.0 | 928.0 | GasA | Gd | Y | SBrkr | 928 | 701 | 0 | 1629 | 0.0 | 0.0 | 2 | 1 | 3 | 1 | TA | 6 | Typ | 1 | TA | Attchd | 1997.0 | Fin | 2.0 | 482.0 | TA | TA | Y | 212 | 34 | 0 | 0 | 0 | 0 | NaN | MnPrv | NaN | 0 | 3 | 2010 | WD | Normal | 189900 |
# General info: per-column non-null counts and dtypes of the loaded frame
data.info()
<class 'pandas.core.frame.DataFrame'> RangeIndex: 2930 entries, 0 to 2929 Data columns (total 82 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 Order 2930 non-null int64 1 PID 2930 non-null int64 2 MS SubClass 2930 non-null int64 3 MS Zoning 2930 non-null object 4 Lot Frontage 2440 non-null float64 5 Lot Area 2930 non-null int64 6 Street 2930 non-null object 7 Alley 198 non-null object 8 Lot Shape 2930 non-null object 9 Land Contour 2930 non-null object 10 Utilities 2930 non-null object 11 Lot Config 2930 non-null object 12 Land Slope 2930 non-null object 13 Neighborhood 2930 non-null object 14 Condition 1 2930 non-null object 15 Condition 2 2930 non-null object 16 Bldg Type 2930 non-null object 17 House Style 2930 non-null object 18 Overall Qual 2930 non-null int64 19 Overall Cond 2930 non-null int64 20 Year Built 2930 non-null int64 21 Year Remod/Add 2930 non-null int64 22 Roof Style 2930 non-null object 23 Roof Matl 2930 non-null object 24 Exterior 1st 2930 non-null object 25 Exterior 2nd 2930 non-null object 26 Mas Vnr Type 2907 non-null object 27 Mas Vnr Area 2907 non-null float64 28 Exter Qual 2930 non-null object 29 Exter Cond 2930 non-null object 30 Foundation 2930 non-null object 31 Bsmt Qual 2850 non-null object 32 Bsmt Cond 2850 non-null object 33 Bsmt Exposure 2847 non-null object 34 BsmtFin Type 1 2850 non-null object 35 BsmtFin SF 1 2929 non-null float64 36 BsmtFin Type 2 2849 non-null object 37 BsmtFin SF 2 2929 non-null float64 38 Bsmt Unf SF 2929 non-null float64 39 Total Bsmt SF 2929 non-null float64 40 Heating 2930 non-null object 41 Heating QC 2930 non-null object 42 Central Air 2930 non-null object 43 Electrical 2929 non-null object 44 1st Flr SF 2930 non-null int64 45 2nd Flr SF 2930 non-null int64 46 Low Qual Fin SF 2930 non-null int64 47 Gr Liv Area 2930 non-null int64 48 Bsmt Full Bath 2928 non-null float64 49 Bsmt Half Bath 2928 non-null float64 50 Full Bath 2930 non-null int64 51 Half Bath 2930 non-null int64 52 
Bedroom AbvGr 2930 non-null int64 53 Kitchen AbvGr 2930 non-null int64 54 Kitchen Qual 2930 non-null object 55 TotRms AbvGrd 2930 non-null int64 56 Functional 2930 non-null object 57 Fireplaces 2930 non-null int64 58 Fireplace Qu 1508 non-null object 59 Garage Type 2773 non-null object 60 Garage Yr Blt 2771 non-null float64 61 Garage Finish 2771 non-null object 62 Garage Cars 2929 non-null float64 63 Garage Area 2929 non-null float64 64 Garage Qual 2771 non-null object 65 Garage Cond 2771 non-null object 66 Paved Drive 2930 non-null object 67 Wood Deck SF 2930 non-null int64 68 Open Porch SF 2930 non-null int64 69 Enclosed Porch 2930 non-null int64 70 3Ssn Porch 2930 non-null int64 71 Screen Porch 2930 non-null int64 72 Pool Area 2930 non-null int64 73 Pool QC 13 non-null object 74 Fence 572 non-null object 75 Misc Feature 106 non-null object 76 Misc Val 2930 non-null int64 77 Mo Sold 2930 non-null int64 78 Yr Sold 2930 non-null int64 79 Sale Type 2930 non-null object 80 Sale Condition 2930 non-null object 81 SalePrice 2930 non-null int64 dtypes: float64(11), int64(28), object(43) memory usage: 1.8+ MB
# Count missing values per column in one vectorised pass and keep only the
# columns that actually contain gaps.
missing_counts = data.isna().sum()
missing_counts = missing_counts[missing_counts > 0]

# Two-column summary (column name, number of missing values); building it from
# the Series keeps 'ValueCount' as an integer column rather than object dtype.
data_gaps = pd.DataFrame({
    'ColumnName': missing_counts.index,
    'ValueCount': missing_counts.to_numpy(),
})
data_gaps = data_gaps.sort_values(by=['ValueCount'], ascending=False)

# Bar chart of the number of missing values per affected column.
fig = px.bar(data_gaps, x="ColumnName", y="ValueCount")
fig.show()
Powyżej widać, że kolumny z lukami w danych można podzielić na 3 grupy:
Brak informacji w pewnych przypadkach również może być informacją
Próg odrzucenia danych został określony arbitralnie na poziomie 300 null'i w obserwacjach
def plot_treeMap(data, title, column_values= 'ValueCount',column_category='ColumnName'):
    """Draw a treemap with one tile per row of ``data``.

    Args:
        data: Table indexable by column name (e.g. a DataFrame).
        title: Text shown above the plot.
        column_values: Name of the column that determines the tile sizes.
        column_category: Name of the column that supplies the tile labels.
    """
    tile_sizes = data[column_values]
    tile_labels = data[column_category]

    # Wide, flat canvas: the treemap is rendered as a horizontal strip.
    plt.figure(figsize=(16, 2))
    squarify.plot(sizes=tile_sizes, label=tile_labels, alpha=0.5)
    plt.axis('off')
    plt.title(title, fontweight="bold", fontsize=23)
    plt.show()
# Split the columns with gaps into three groups by their number of missing
# values. Both cut-offs are arbitrary choices.
DROP_THRESHOLD = 300   # more gaps than this -> the column is dropped
MINOR_THRESHOLD = 10   # fewer gaps than this -> only minor blanks
gap_counts = data_gaps['ValueCount']

# Plain boolean indexing keeps the original dtypes (where().dropna() would
# turn the counts into floats).
data_gaps_to_drop = data_gaps[gap_counts > DROP_THRESHOLD]
plot_treeMap(data = data_gaps_to_drop, title= "Columns to drop")

# Both bounds are inclusive so that a column with exactly 10 or exactly 300
# gaps is still assigned to a group instead of silently falling through.
data_gaps_to_repair = data_gaps[(gap_counts >= MINOR_THRESHOLD) & (gap_counts <= DROP_THRESHOLD)]
plot_treeMap(data = data_gaps_to_repair, title= "Columns to repair")

data_gaps_minor = data_gaps[gap_counts < MINOR_THRESHOLD]
plot_treeMap(data = data_gaps_minor, title= "Columns minor blanks")
# Remove every column flagged for dropping with a single drop() call.
# Names no longer present in `data` are skipped, so re-running this cell
# does not fail with a KeyError and the printed list stays truthful.
dropped_names = [
    str(name)
    for name in data_gaps_to_drop['ColumnName']
    if str(name) in data.columns
]
data.drop(columns=dropped_names, inplace=True)
print('successfully dropped columns: ', dropped_names)
successfully dropped columns: ['Pool QC', 'Misc Feature', 'Alley', 'Fence', 'Fireplace Qu', 'Lot Frontage']
# Display the frame as the cell output to inspect it after the column drop
data
| Order | PID | MS SubClass | MS Zoning | Lot Area | Street | Lot Shape | Land Contour | Utilities | Lot Config | Land Slope | Neighborhood | Condition 1 | Condition 2 | Bldg Type | House Style | Overall Qual | Overall Cond | Year Built | Year Remod/Add | Roof Style | Roof Matl | Exterior 1st | Exterior 2nd | Mas Vnr Type | Mas Vnr Area | Exter Qual | Exter Cond | Foundation | Bsmt Qual | Bsmt Cond | Bsmt Exposure | BsmtFin Type 1 | BsmtFin SF 1 | BsmtFin Type 2 | BsmtFin SF 2 | Bsmt Unf SF | Total Bsmt SF | Heating | Heating QC | Central Air | Electrical | 1st Flr SF | 2nd Flr SF | Low Qual Fin SF | Gr Liv Area | Bsmt Full Bath | Bsmt Half Bath | Full Bath | Half Bath | Bedroom AbvGr | Kitchen AbvGr | Kitchen Qual | TotRms AbvGrd | Functional | Fireplaces | Garage Type | Garage Yr Blt | Garage Finish | Garage Cars | Garage Area | Garage Qual | Garage Cond | Paved Drive | Wood Deck SF | Open Porch SF | Enclosed Porch | 3Ssn Porch | Screen Porch | Pool Area | Misc Val | Mo Sold | Yr Sold | Sale Type | Sale Condition | SalePrice | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 1 | 526301100 | 20 | RL | 31770 | Pave | IR1 | Lvl | AllPub | Corner | Gtl | NAmes | Norm | Norm | 1Fam | 1Story | 6 | 5 | 1960 | 1960 | Hip | CompShg | BrkFace | Plywood | Stone | 112.0 | TA | TA | CBlock | TA | Gd | Gd | BLQ | 639.0 | Unf | 0.0 | 441.0 | 1080.0 | GasA | Fa | Y | SBrkr | 1656 | 0 | 0 | 1656 | 1.0 | 0.0 | 1 | 0 | 3 | 1 | TA | 7 | Typ | 2 | Attchd | 1960.0 | Fin | 2.0 | 528.0 | TA | TA | P | 210 | 62 | 0 | 0 | 0 | 0 | 0 | 5 | 2010 | WD | Normal | 215000 |
| 1 | 2 | 526350040 | 20 | RH | 11622 | Pave | Reg | Lvl | AllPub | Inside | Gtl | NAmes | Feedr | Norm | 1Fam | 1Story | 5 | 6 | 1961 | 1961 | Gable | CompShg | VinylSd | VinylSd | None | 0.0 | TA | TA | CBlock | TA | TA | No | Rec | 468.0 | LwQ | 144.0 | 270.0 | 882.0 | GasA | TA | Y | SBrkr | 896 | 0 | 0 | 896 | 0.0 | 0.0 | 1 | 0 | 2 | 1 | TA | 5 | Typ | 0 | Attchd | 1961.0 | Unf | 1.0 | 730.0 | TA | TA | Y | 140 | 0 | 0 | 0 | 120 | 0 | 0 | 6 | 2010 | WD | Normal | 105000 |
| 2 | 3 | 526351010 | 20 | RL | 14267 | Pave | IR1 | Lvl | AllPub | Corner | Gtl | NAmes | Norm | Norm | 1Fam | 1Story | 6 | 6 | 1958 | 1958 | Hip | CompShg | Wd Sdng | Wd Sdng | BrkFace | 108.0 | TA | TA | CBlock | TA | TA | No | ALQ | 923.0 | Unf | 0.0 | 406.0 | 1329.0 | GasA | TA | Y | SBrkr | 1329 | 0 | 0 | 1329 | 0.0 | 0.0 | 1 | 1 | 3 | 1 | Gd | 6 | Typ | 0 | Attchd | 1958.0 | Unf | 1.0 | 312.0 | TA | TA | Y | 393 | 36 | 0 | 0 | 0 | 0 | 12500 | 6 | 2010 | WD | Normal | 172000 |
| 3 | 4 | 526353030 | 20 | RL | 11160 | Pave | Reg | Lvl | AllPub | Corner | Gtl | NAmes | Norm | Norm | 1Fam | 1Story | 7 | 5 | 1968 | 1968 | Hip | CompShg | BrkFace | BrkFace | None | 0.0 | Gd | TA | CBlock | TA | TA | No | ALQ | 1065.0 | Unf | 0.0 | 1045.0 | 2110.0 | GasA | Ex | Y | SBrkr | 2110 | 0 | 0 | 2110 | 1.0 | 0.0 | 2 | 1 | 3 | 1 | Ex | 8 | Typ | 2 | Attchd | 1968.0 | Fin | 2.0 | 522.0 | TA | TA | Y | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 4 | 2010 | WD | Normal | 244000 |
| 4 | 5 | 527105010 | 60 | RL | 13830 | Pave | IR1 | Lvl | AllPub | Inside | Gtl | Gilbert | Norm | Norm | 1Fam | 2Story | 5 | 5 | 1997 | 1998 | Gable | CompShg | VinylSd | VinylSd | None | 0.0 | TA | TA | PConc | Gd | TA | No | GLQ | 791.0 | Unf | 0.0 | 137.0 | 928.0 | GasA | Gd | Y | SBrkr | 928 | 701 | 0 | 1629 | 0.0 | 0.0 | 2 | 1 | 3 | 1 | TA | 6 | Typ | 1 | Attchd | 1997.0 | Fin | 2.0 | 482.0 | TA | TA | Y | 212 | 34 | 0 | 0 | 0 | 0 | 0 | 3 | 2010 | WD | Normal | 189900 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 2925 | 2926 | 923275080 | 80 | RL | 7937 | Pave | IR1 | Lvl | AllPub | CulDSac | Gtl | Mitchel | Norm | Norm | 1Fam | SLvl | 6 | 6 | 1984 | 1984 | Gable | CompShg | HdBoard | HdBoard | None | 0.0 | TA | TA | CBlock | TA | TA | Av | GLQ | 819.0 | Unf | 0.0 | 184.0 | 1003.0 | GasA | TA | Y | SBrkr | 1003 | 0 | 0 | 1003 | 1.0 | 0.0 | 1 | 0 | 3 | 1 | TA | 6 | Typ | 0 | Detchd | 1984.0 | Unf | 2.0 | 588.0 | TA | TA | Y | 120 | 0 | 0 | 0 | 0 | 0 | 0 | 3 | 2006 | WD | Normal | 142500 |
| 2926 | 2927 | 923276100 | 20 | RL | 8885 | Pave | IR1 | Low | AllPub | Inside | Mod | Mitchel | Norm | Norm | 1Fam | 1Story | 5 | 5 | 1983 | 1983 | Gable | CompShg | HdBoard | HdBoard | None | 0.0 | TA | TA | CBlock | Gd | TA | Av | BLQ | 301.0 | ALQ | 324.0 | 239.0 | 864.0 | GasA | TA | Y | SBrkr | 902 | 0 | 0 | 902 | 1.0 | 0.0 | 1 | 0 | 2 | 1 | TA | 5 | Typ | 0 | Attchd | 1983.0 | Unf | 2.0 | 484.0 | TA | TA | Y | 164 | 0 | 0 | 0 | 0 | 0 | 0 | 6 | 2006 | WD | Normal | 131000 |
| 2927 | 2928 | 923400125 | 85 | RL | 10441 | Pave | Reg | Lvl | AllPub | Inside | Gtl | Mitchel | Norm | Norm | 1Fam | SFoyer | 5 | 5 | 1992 | 1992 | Gable | CompShg | HdBoard | Wd Shng | None | 0.0 | TA | TA | PConc | Gd | TA | Av | GLQ | 337.0 | Unf | 0.0 | 575.0 | 912.0 | GasA | TA | Y | SBrkr | 970 | 0 | 0 | 970 | 0.0 | 1.0 | 1 | 0 | 3 | 1 | TA | 6 | Typ | 0 | NaN | NaN | NaN | 0.0 | 0.0 | NaN | NaN | Y | 80 | 32 | 0 | 0 | 0 | 0 | 700 | 7 | 2006 | WD | Normal | 132000 |
| 2928 | 2929 | 924100070 | 20 | RL | 10010 | Pave | Reg | Lvl | AllPub | Inside | Mod | Mitchel | Norm | Norm | 1Fam | 1Story | 5 | 5 | 1974 | 1975 | Gable | CompShg | HdBoard | HdBoard | None | 0.0 | TA | TA | CBlock | Gd | TA | Av | ALQ | 1071.0 | LwQ | 123.0 | 195.0 | 1389.0 | GasA | Gd | Y | SBrkr | 1389 | 0 | 0 | 1389 | 1.0 | 0.0 | 1 | 0 | 2 | 1 | TA | 6 | Typ | 1 | Attchd | 1975.0 | RFn | 2.0 | 418.0 | TA | TA | Y | 240 | 38 | 0 | 0 | 0 | 0 | 0 | 4 | 2006 | WD | Normal | 170000 |
| 2929 | 2930 | 924151050 | 60 | RL | 9627 | Pave | Reg | Lvl | AllPub | Inside | Mod | Mitchel | Norm | Norm | 1Fam | 2Story | 7 | 5 | 1993 | 1994 | Gable | CompShg | HdBoard | HdBoard | BrkFace | 94.0 | TA | TA | PConc | Gd | TA | Av | LwQ | 758.0 | Unf | 0.0 | 238.0 | 996.0 | GasA | Ex | Y | SBrkr | 996 | 1004 | 0 | 2000 | 0.0 | 0.0 | 2 | 1 | 3 | 1 | TA | 9 | Typ | 1 | Attchd | 1993.0 | Fin | 3.0 | 650.0 | TA | TA | Y | 190 | 48 | 0 | 0 | 0 | 0 | 0 | 11 | 2006 | WD | Normal | 188000 |
2930 rows × 76 columns
# General stats: count, mean, std, min, quartiles and max per column
data.describe()
| Order | PID | MS SubClass | Lot Area | Overall Qual | Overall Cond | Year Built | Year Remod/Add | Mas Vnr Area | BsmtFin SF 1 | BsmtFin SF 2 | Bsmt Unf SF | Total Bsmt SF | 1st Flr SF | 2nd Flr SF | Low Qual Fin SF | Gr Liv Area | Bsmt Full Bath | Bsmt Half Bath | Full Bath | Half Bath | Bedroom AbvGr | Kitchen AbvGr | TotRms AbvGrd | Fireplaces | Garage Yr Blt | Garage Cars | Garage Area | Wood Deck SF | Open Porch SF | Enclosed Porch | 3Ssn Porch | Screen Porch | Pool Area | Misc Val | Mo Sold | Yr Sold | SalePrice | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| count | 2930.00000 | 2.930000e+03 | 2930.000000 | 2930.000000 | 2930.000000 | 2930.000000 | 2930.000000 | 2930.000000 | 2907.000000 | 2929.000000 | 2929.000000 | 2929.000000 | 2929.000000 | 2930.000000 | 2930.000000 | 2930.000000 | 2930.000000 | 2928.000000 | 2928.000000 | 2930.000000 | 2930.000000 | 2930.000000 | 2930.000000 | 2930.000000 | 2930.000000 | 2771.000000 | 2929.000000 | 2929.000000 | 2930.000000 | 2930.000000 | 2930.000000 | 2930.000000 | 2930.000000 | 2930.000000 | 2930.000000 | 2930.000000 | 2930.000000 | 2930.000000 |
| mean | 1465.50000 | 7.144645e+08 | 57.387372 | 10147.921843 | 6.094881 | 5.563140 | 1971.356314 | 1984.266553 | 101.896801 | 442.629566 | 49.722431 | 559.262547 | 1051.614544 | 1159.557679 | 335.455973 | 4.676792 | 1499.690444 | 0.431352 | 0.061134 | 1.566553 | 0.379522 | 2.854266 | 1.044369 | 6.443003 | 0.599317 | 1978.132443 | 1.766815 | 472.819734 | 93.751877 | 47.533447 | 23.011604 | 2.592491 | 16.002048 | 2.243345 | 50.635154 | 6.216041 | 2007.790444 | 180796.060068 |
| std | 845.96247 | 1.887308e+08 | 42.638025 | 7880.017759 | 1.411026 | 1.111537 | 30.245361 | 20.860286 | 179.112611 | 455.590839 | 169.168476 | 439.494153 | 440.615067 | 391.890885 | 428.395715 | 46.310510 | 505.508887 | 0.524820 | 0.245254 | 0.552941 | 0.502629 | 0.827731 | 0.214076 | 1.572964 | 0.647921 | 25.528411 | 0.760566 | 215.046549 | 126.361562 | 67.483400 | 64.139059 | 25.141331 | 56.087370 | 35.597181 | 566.344288 | 2.714492 | 1.316613 | 79886.692357 |
| min | 1.00000 | 5.263011e+08 | 20.000000 | 1300.000000 | 1.000000 | 1.000000 | 1872.000000 | 1950.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 334.000000 | 0.000000 | 0.000000 | 334.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 2.000000 | 0.000000 | 1895.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 1.000000 | 2006.000000 | 12789.000000 |
| 25% | 733.25000 | 5.284770e+08 | 20.000000 | 7440.250000 | 5.000000 | 5.000000 | 1954.000000 | 1965.000000 | 0.000000 | 0.000000 | 0.000000 | 219.000000 | 793.000000 | 876.250000 | 0.000000 | 0.000000 | 1126.000000 | 0.000000 | 0.000000 | 1.000000 | 0.000000 | 2.000000 | 1.000000 | 5.000000 | 0.000000 | 1960.000000 | 1.000000 | 320.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 4.000000 | 2007.000000 | 129500.000000 |
| 50% | 1465.50000 | 5.354536e+08 | 50.000000 | 9436.500000 | 6.000000 | 5.000000 | 1973.000000 | 1993.000000 | 0.000000 | 370.000000 | 0.000000 | 466.000000 | 990.000000 | 1084.000000 | 0.000000 | 0.000000 | 1442.000000 | 0.000000 | 0.000000 | 2.000000 | 0.000000 | 3.000000 | 1.000000 | 6.000000 | 1.000000 | 1979.000000 | 2.000000 | 480.000000 | 0.000000 | 27.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 6.000000 | 2008.000000 | 160000.000000 |
| 75% | 2197.75000 | 9.071811e+08 | 70.000000 | 11555.250000 | 7.000000 | 6.000000 | 2001.000000 | 2004.000000 | 164.000000 | 734.000000 | 0.000000 | 802.000000 | 1302.000000 | 1384.000000 | 703.750000 | 0.000000 | 1742.750000 | 1.000000 | 0.000000 | 2.000000 | 1.000000 | 3.000000 | 1.000000 | 7.000000 | 1.000000 | 2002.000000 | 2.000000 | 576.000000 | 168.000000 | 70.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 8.000000 | 2009.000000 | 213500.000000 |
| max | 2930.00000 | 1.007100e+09 | 190.000000 | 215245.000000 | 10.000000 | 9.000000 | 2010.000000 | 2010.000000 | 1600.000000 | 5644.000000 | 1526.000000 | 2336.000000 | 6110.000000 | 5095.000000 | 2065.000000 | 1064.000000 | 5642.000000 | 3.000000 | 2.000000 | 4.000000 | 2.000000 | 8.000000 | 3.000000 | 15.000000 | 4.000000 | 2207.000000 | 5.000000 | 1488.000000 | 1424.000000 | 742.000000 | 1012.000000 | 508.000000 | 576.000000 | 800.000000 | 17000.000000 | 12.000000 | 2010.000000 | 755000.000000 |
Powyższe zestawienie ma zastosowanie wyłącznie dla zmiennych ilościowych.
Najprostszym rozwiązaniem problemu obserwacji odstających byłoby pozbycie się danych wykraczających poza wyznaczone granice miar pozycyjnych. Kwartyle mogą okazać się zbyt ekstremalne.
Statystyki zostały przeliczone również dla zmiennych porządkowych - sprawdzanie statystyk pozycyjnych dla nich nakierowane jest raczej na wychwycenie błędów w danych niż określenie kryteriów odrzucenia na podstawie miar pozycyjnych.
print('Slide to check percentiles')


def f(x):
    """Display the x-th percentile of the numeric columns of ``data``.

    Args:
        x: Percentile as a whole number from 0 to 100 (the slider value).
    """
    y = x / 100
    # numeric_only=True is required because `data` also holds text columns;
    # without it recent pandas versions raise a TypeError.
    display(data.quantile(y, numeric_only=True))


# Trailing semicolon suppresses the widget's repr in the notebook output.
interact(f, x=widgets.IntSlider(min=0, max=100, step=1, value=10));
Slide to check percentiles
interactive(children=(IntSlider(value=10, description='x'), Output()), _dom_classes=('widget-interact',))
Podejście do określenia obserwacji odstających w przypadku zmiennych kategorialnych jest nieco bardziej skomplikowane ze względu na brak możliwości bezpośredniej kwantyfikacji. Z tego względu zadecydowano, że przydatność zmiennych nominalnych na poczet przyszłych modelowań zostanie sprawdzona za pomocą korelacji względem zmiennej objaśnianej oraz względem innych zmiennych objaśniających.
# Names of all columns stored with the generic 'object' dtype.
data_nominal_list = [
    column for column in data.columns if data[column].dtypes == 'object'
]
print('Columns with nominal data:' , data_nominal_list)

# Copy those columns into a separate frame and append the sale price as the
# last column so it can be correlated against them.
data_nominal = data[data_nominal_list].copy()
data_nominal['Y-SalesPrice'] = data['SalePrice']
data_nominal.head()
Columns with nominal data: ['MS Zoning', 'Street', 'Lot Shape', 'Land Contour', 'Utilities', 'Lot Config', 'Land Slope', 'Neighborhood', 'Condition 1', 'Condition 2', 'Bldg Type', 'House Style', 'Roof Style', 'Roof Matl', 'Exterior 1st', 'Exterior 2nd', 'Mas Vnr Type', 'Exter Qual', 'Exter Cond', 'Foundation', 'Bsmt Qual', 'Bsmt Cond', 'Bsmt Exposure', 'BsmtFin Type 1', 'BsmtFin Type 2', 'Heating', 'Heating QC', 'Central Air', 'Electrical', 'Kitchen Qual', 'Functional', 'Garage Type', 'Garage Finish', 'Garage Qual', 'Garage Cond', 'Paved Drive', 'Sale Type', 'Sale Condition']
| MS Zoning | Street | Lot Shape | Land Contour | Utilities | Lot Config | Land Slope | Neighborhood | Condition 1 | Condition 2 | Bldg Type | House Style | Roof Style | Roof Matl | Exterior 1st | Exterior 2nd | Mas Vnr Type | Exter Qual | Exter Cond | Foundation | Bsmt Qual | Bsmt Cond | Bsmt Exposure | BsmtFin Type 1 | BsmtFin Type 2 | Heating | Heating QC | Central Air | Electrical | Kitchen Qual | Functional | Garage Type | Garage Finish | Garage Qual | Garage Cond | Paved Drive | Sale Type | Sale Condition | Y-SalesPrice | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | RL | Pave | IR1 | Lvl | AllPub | Corner | Gtl | NAmes | Norm | Norm | 1Fam | 1Story | Hip | CompShg | BrkFace | Plywood | Stone | TA | TA | CBlock | TA | Gd | Gd | BLQ | Unf | GasA | Fa | Y | SBrkr | TA | Typ | Attchd | Fin | TA | TA | P | WD | Normal | 215000 |
| 1 | RH | Pave | Reg | Lvl | AllPub | Inside | Gtl | NAmes | Feedr | Norm | 1Fam | 1Story | Gable | CompShg | VinylSd | VinylSd | None | TA | TA | CBlock | TA | TA | No | Rec | LwQ | GasA | TA | Y | SBrkr | TA | Typ | Attchd | Unf | TA | TA | Y | WD | Normal | 105000 |
| 2 | RL | Pave | IR1 | Lvl | AllPub | Corner | Gtl | NAmes | Norm | Norm | 1Fam | 1Story | Hip | CompShg | Wd Sdng | Wd Sdng | BrkFace | TA | TA | CBlock | TA | TA | No | ALQ | Unf | GasA | TA | Y | SBrkr | Gd | Typ | Attchd | Unf | TA | TA | Y | WD | Normal | 172000 |
| 3 | RL | Pave | Reg | Lvl | AllPub | Corner | Gtl | NAmes | Norm | Norm | 1Fam | 1Story | Hip | CompShg | BrkFace | BrkFace | None | Gd | TA | CBlock | TA | TA | No | ALQ | Unf | GasA | Ex | Y | SBrkr | Ex | Typ | Attchd | Fin | TA | TA | Y | WD | Normal | 244000 |
| 4 | RL | Pave | IR1 | Lvl | AllPub | Inside | Gtl | Gilbert | Norm | Norm | 1Fam | 2Story | Gable | CompShg | VinylSd | VinylSd | None | TA | TA | PConc | Gd | TA | No | GLQ | Unf | GasA | Gd | Y | SBrkr | TA | Typ | Attchd | Fin | TA | TA | Y | WD | Normal | 189900 |
W celu określenia przydatności danych nominalnych zastosowana została korelacja $\phi_K$ (phik), która działa konsekwentnie między zmiennymi kategorialnymi, porządkowymi i przedziałowymi. Dodatkowo wychwytuje nieliniowe zależności.
# Pairwise phi_K correlation matrix of the nominal columns and the sale price.
# The sale price is the only interval (continuous) variable in this frame;
# naming it explicitly avoids relying on phik's automatic guess.
phik_overview = data_nominal.phik_matrix(interval_cols=['Y-SalesPrice'])

# Heat map of the matrix; phi_K values lie between 0 and 1.
plot_correlation_matrix(phik_overview.values,
                        x_labels=phik_overview.columns,
                        y_labels=phik_overview.index,
                        vmin=0, vmax=1, color_map="Greens",
                        title=r"correlation $\phi_K$",
                        fontsize_factor=1.5,
                        figsize=(42, 18))
plt.tight_layout()
interval columns not set, guessing: ['Y-SalesPrice']
Phik global correlation zastosowano do określenia korelacji zmiennej względem wszystkich innych zmiennych w zbiorze danych. Niewątpliwie na początku analizy warto sobie taką wizualizację wygenerować. Koncepcja wydaje się być znana z doboru zmiennych do modeli ekonometrycznych z wykorzystaniem metody Hellwiga.
# Global phi_K correlation of every column of `data`, with matching labels.
# NOTE(review): no interval columns are passed, so phik guesses them; the
# record identifiers 'Order' and 'PID' end up in that guess - confirm whether
# they should be part of this analysis.
global_correlation, global_labels = data.global_phik()

# Single-column heat map: one row per variable, hence the blank x label.
heatmap_options = dict(
    figsize=(20, 30),
    vmin=0,
    vmax=1,
    color_map="Greens",
    fontsize_factor=1.5,
    title=r"Global Correlation",
)
plot_correlation_matrix(
    global_correlation,
    x_labels=[''],
    y_labels=global_labels,
    **heatmap_options,
)
plt.tight_layout()
interval columns not set, guessing: ['Order', 'PID', 'MS SubClass', 'Lot Area', 'Overall Qual', 'Overall Cond', 'Year Built', 'Year Remod/Add', 'Mas Vnr Area', 'BsmtFin SF 1', 'BsmtFin SF 2', 'Bsmt Unf SF', 'Total Bsmt SF', '1st Flr SF', '2nd Flr SF', 'Low Qual Fin SF', 'Gr Liv Area', 'Bsmt Full Bath', 'Bsmt Half Bath', 'Full Bath', 'Half Bath', 'Bedroom AbvGr', 'Kitchen AbvGr', 'TotRms AbvGrd', 'Fireplaces', 'Garage Yr Blt', 'Garage Cars', 'Garage Area', 'Wood Deck SF', 'Open Porch SF', 'Enclosed Porch', '3Ssn Porch', 'Screen Porch', 'Pool Area', 'Misc Val', 'Mo Sold', 'Yr Sold', 'SalePrice']
c:\users\kamil\appdata\local\programs\python\python39\lib\site-packages\phik\phik.py:319: RuntimeWarning: invalid value encountered in sqrt
W celu kalkulacji miar ogólnych oraz podstawowych wizualizacji zastosowana została biblioteka pandas_profiling (Dokumentacja | Implementacja w Streamlit). Narzędzie to pozwala na bardzo szczegółową analizę zbiorów danych. W podstawowej wersji generowane są następujące informacje:
Dane nie zostały podzielone na zbiory treningowy i walidacyjny ze względu na to, że punkt pierwszy nie wymaga weryfikacji modelowania.
# Request every correlation measure explicitly.
# NOTE(review): the report title says "without correlations" although all of
# them are switched on here - confirm which of the two is intended, and how
# these settings interact with minimal=True.
correlation_settings = {
    method: {"calculate": True}
    for method in ("pearson", "spearman", "kendall", "phi_k", "cramers")
}
profile = data.profile_report(
    title="Report without correlations",
    correlations=correlation_settings,
    minimal=True,
)
profile
Summarize dataset: 0%| | 0/5 [00:00<?, ?it/s]
Generate report structure: 0%| | 0/1 [00:00<?, ?it/s]
Render HTML: 0%| | 0/1 [00:00<?, ?it/s]
Z raportu wygenerowanego przez pandas_profiling wiadomo już, jak wygląda rozkład danych; są tam też parametry, które wskazują na charakter rozkładu, m.in. średnia, mediana, skośność czy kurtoza.
Python może to jednak zrobić bezboleśnie dla nas za pomocą biblioteki FITTER
# Sale prices as a plain array, reused by the distribution fitting below.
SalePrice = data["SalePrice"].to_numpy()

# Histogram of the sale prices in 50 bins on a wide canvas.
plt.figure(figsize=(16, 6))
SalePrice_hist = plt.hist(x=SalePrice, bins=50)

# List the distribution names offered by fitter: the common subset first,
# then the full set.
for caption, list_names in (
    ('popularne rozkłady:', get_common_distributions),
    ('wszystkie dostepne rozkłady :', get_distributions),
):
    print(caption, list_names())
popularne rozkłady: ['cauchy', 'chi2', 'expon', 'exponpow', 'gamma', 'lognorm', 'norm', 'powerlaw', 'rayleigh', 'uniform'] wszystkie dostepne rozkłady : ['alpha', 'anglit', 'arcsine', 'argus', 'beta', 'betaprime', 'bradford', 'burr', 'burr12', 'cauchy', 'chi', 'chi2', 'cosine', 'crystalball', 'dgamma', 'dweibull', 'erlang', 'expon', 'exponnorm', 'exponpow', 'exponweib', 'f', 'fatiguelife', 'fisk', 'foldcauchy', 'foldnorm', 'gamma', 'gausshyper', 'genexpon', 'genextreme', 'gengamma', 'genhalflogistic', 'genhyperbolic', 'geninvgauss', 'genlogistic', 'gennorm', 'genpareto', 'gilbrat', 'gompertz', 'gumbel_l', 'gumbel_r', 'halfcauchy', 'halfgennorm', 'halflogistic', 'halfnorm', 'hypsecant', 'invgamma', 'invgauss', 'invweibull', 'johnsonsb', 'johnsonsu', 'kappa3', 'kappa4', 'ksone', 'kstwo', 'kstwobign', 'laplace', 'laplace_asymmetric', 'levy', 'levy_l', 'levy_stable', 'loggamma', 'logistic', 'loglaplace', 'lognorm', 'loguniform', 'lomax', 'maxwell', 'mielke', 'moyal', 'nakagami', 'ncf', 'nct', 'ncx2', 'norm', 'norminvgauss', 'pareto', 'pearson3', 'powerlaw', 'powerlognorm', 'powernorm', 'rayleigh', 'rdist', 'recipinvgauss', 'reciprocal', 'rice', 'rv_continuous', 'rv_histogram', 'semicircular', 'skewcauchy', 'skewnorm', 'studentized_range', 't', 'trapezoid', 'trapz', 'triang', 'truncexpon', 'truncnorm', 'tukeylambda', 'uniform', 'vonmises', 'vonmises_line', 'wald', 'weibull_max', 'weibull_min', 'wrapcauchy']
# Try to fit the sale-price distribution to one of the popular distributions.
# The candidates are restricted to fitter's common set; with no `distributions`
# argument every available distribution is tried, which takes minutes and
# skips many candidates on the per-distribution timeout.
f = Fitter(SalePrice, distributions=get_common_distributions())
f.fit()
# Table (and plot) of the best-fitting candidates.
f.summary()
Fitting 106 distributions: 25%|█████████████▏ | 26/106 [00:11<00:39, 2.00it/s]c:\users\kamil\appdata\local\programs\python\python39\lib\site-packages\scipy\stats\_continuous_distns.py:3102: IntegrationWarning: The algorithm does not converge. Roundoff error is detected in the extrapolation table. It is assumed that the requested tolerance cannot be achieved, and that the returned result (if full_output = 1) is the best which can be obtained. c:\users\kamil\appdata\local\programs\python\python39\lib\site-packages\scipy\stats\_continuous_distns.py:3102: IntegrationWarning: The integral is probably divergent, or slowly convergent. Fitting 106 distributions: 36%|███████████████████▎ | 38/106 [00:15<00:14, 4.69it/s]WARNING:root:SKIPPED kstwo distribution (taking more than 30 seconds) Fitting 106 distributions: 46%|████████████████████████▉ | 49/106 [00:30<02:24, 2.53s/it]WARNING:root:SKIPPED burr distribution (taking more than 30 seconds) Fitting 106 distributions: 47%|█████████████████████████▍ | 50/106 [00:30<01:52, 2.01s/it]WARNING:root:SKIPPED gausshyper distribution (taking more than 30 seconds) Fitting 106 distributions: 49%|██████████████████████████▍ | 52/106 [00:34<01:38, 1.82s/it]WARNING:root:SKIPPED genextreme distribution (taking more than 30 seconds) Fitting 106 distributions: 51%|███████████████████████████▌ | 54/106 [00:36<01:24, 1.62s/it]WARNING:root:SKIPPED genpareto distribution (taking more than 30 seconds) Fitting 106 distributions: 54%|█████████████████████████████ | 57/106 [00:44<01:37, 1.98s/it]WARNING:root:SKIPPED johnsonsb distribution (taking more than 30 seconds) Fitting 106 distributions: 57%|██████████████████████████████▌ | 60/106 [00:45<00:44, 1.03it/s]WARNING:root:SKIPPED kappa4 distribution (taking more than 30 seconds) Fitting 106 distributions: 58%|███████████████████████████████ | 61/106 [00:45<00:33, 1.36it/s]WARNING:root:SKIPPED ksone distribution (taking more than 30 seconds) Fitting 106 distributions: 
58%|███████████████████████████████▌ | 62/106 [00:45<00:24, 1.78it/s]WARNING:root:SKIPPED levy_stable distribution (taking more than 30 seconds) Fitting 106 distributions: 63%|██████████████████████████████████▏ | 67/106 [00:47<00:17, 2.20it/s]WARNING:root:SKIPPED loggamma distribution (taking more than 30 seconds) Fitting 106 distributions: 65%|███████████████████████████████████▏ | 69/106 [00:50<00:29, 1.26it/s]WARNING:root:SKIPPED lognorm distribution (taking more than 30 seconds) Fitting 106 distributions: 67%|████████████████████████████████████▏ | 71/106 [01:01<01:32, 2.64s/it]WARNING:root:SKIPPED rv_continuous distribution (taking more than 30 seconds) WARNING:root:SKIPPED rv_histogram distribution (taking more than 30 seconds) Fitting 106 distributions: 69%|█████████████████████████████████████▏ | 73/106 [01:01<00:49, 1.50s/it]WARNING:root:SKIPPED lomax distribution (taking more than 30 seconds) Fitting 106 distributions: 70%|█████████████████████████████████████▋ | 74/106 [01:03<00:47, 1.49s/it]WARNING:root:SKIPPED mielke distribution (taking more than 30 seconds) Fitting 106 distributions: 72%|██████████████████████████████████████▋ | 76/106 [01:05<00:38, 1.30s/it]WARNING:root:SKIPPED ncf distribution (taking more than 30 seconds) Fitting 106 distributions: 73%|███████████████████████████████████████▏ | 77/106 [01:13<01:32, 3.18s/it]WARNING:root:SKIPPED nct distribution (taking more than 30 seconds) Fitting 106 distributions: 74%|███████████████████████████████████████▋ | 78/106 [01:14<01:08, 2.46s/it]WARNING:root:SKIPPED ncx2 distribution (taking more than 30 seconds) Fitting 106 distributions: 75%|████████████████████████████████████████▏ | 79/106 [01:15<00:53, 1.99s/it]c:\users\kamil\appdata\local\programs\python\python39\lib\site-packages\scipy\integrate\_quadpack_py.py:879: IntegrationWarning: The maximum number of subdivisions (50) has been achieved. 
If increasing the limit yields no improvement it is advised to analyze the integrand in order to determine the difficulties. If the position of a local difficulty can be determined (singularity, discontinuity) one will probably gain from splitting up the interval and calling the integrator on the subranges. Perhaps a special-purpose integrator should be used. WARNING:root:SKIPPED norminvgauss distribution (taking more than 30 seconds) WARNING:root:SKIPPED pareto distribution (taking more than 30 seconds) Fitting 106 distributions: 75%|████████████████████████████████████████▊ | 80/106 [01:15<00:43, 1.69s/it]WARNING:root:SKIPPED pearson3 distribution (taking more than 30 seconds) Fitting 106 distributions: 76%|█████████████████████████████████████████▎ | 81/106 [01:16<00:30, 1.23s/it]WARNING:root:SKIPPED powerlognorm distribution (taking more than 30 seconds) Fitting 106 distributions: 77%|█████████████████████████████████████████▊ | 82/106 [01:16<00:21, 1.09it/s]WARNING:root:SKIPPED powerlaw distribution (taking more than 30 seconds) Fitting 106 distributions: 79%|██████████████████████████████████████████▊ | 84/106 [01:16<00:11, 1.87it/s]WARNING:root:SKIPPED powernorm distribution (taking more than 30 seconds) Fitting 106 distributions: 81%|███████████████████████████████████████████▊ | 86/106 [01:17<00:08, 2.31it/s]WARNING:root:SKIPPED rdist distribution (taking more than 30 seconds) Fitting 106 distributions: 82%|████████████████████████████████████████████▎ | 87/106 [01:18<00:11, 1.64it/s]WARNING:root:SKIPPED recipinvgauss distribution (taking more than 30 seconds) Fitting 106 distributions: 83%|████████████████████████████████████████████▊ | 88/106 [01:20<00:18, 1.04s/it]c:\users\kamil\appdata\local\programs\python\python39\lib\site-packages\scipy\integrate\_quadpack_py.py:879: IntegrationWarning: The integral is probably divergent, or slowly convergent. 
WARNING:root:SKIPPED rice distribution (taking more than 30 seconds) Fitting 106 distributions: 84%|█████████████████████████████████████████████▎ | 89/106 [01:31<01:06, 3.92s/it]WARNING:root:SKIPPED semicircular distribution (taking more than 30 seconds) Fitting 106 distributions: 85%|█████████████████████████████████████████████▊ | 90/106 [01:31<00:47, 2.98s/it]WARNING:root:SKIPPED skewcauchy distribution (taking more than 30 seconds) Fitting 106 distributions: 86%|██████████████████████████████████████████████▎ | 91/106 [01:33<00:38, 2.56s/it]WARNING:root:SKIPPED skewnorm distribution (taking more than 30 seconds) Fitting 106 distributions: 87%|██████████████████████████████████████████████▊ | 92/106 [01:35<00:33, 2.39s/it]WARNING:root:SKIPPED studentized_range distribution (taking more than 30 seconds) Fitting 106 distributions: 88%|███████████████████████████████████████████████▍ | 93/106 [01:35<00:23, 1.80s/it]WARNING:root:SKIPPED t distribution (taking more than 30 seconds) Fitting 106 distributions: 90%|████████████████████████████████████████████████▍ | 95/106 [01:43<00:28, 2.56s/it]WARNING:root:SKIPPED trapezoid distribution (taking more than 30 seconds) Fitting 106 distributions: 91%|████████████████████████████████████████████████▉ | 96/106 [01:44<00:19, 1.99s/it]WARNING:root:SKIPPED trapz distribution (taking more than 30 seconds) Fitting 106 distributions: 92%|█████████████████████████████████████████████████▍ | 97/106 [01:45<00:14, 1.61s/it]WARNING:root:SKIPPED truncexpon distribution (taking more than 30 seconds) Fitting 106 distributions: 92%|█████████████████████████████████████████████████▉ | 98/106 [01:46<00:11, 1.44s/it]WARNING:root:SKIPPED triang distribution (taking more than 30 seconds) Fitting 106 distributions: 93%|██████████████████████████████████████████████████▍ | 99/106 [01:46<00:07, 1.07s/it]WARNING:root:SKIPPED tukeylambda distribution (taking more than 30 seconds) Fitting 106 distributions: 
94%|██████████████████████████████████████████████████ | 100/106 [01:46<00:04, 1.25it/s]WARNING:root:SKIPPED vonmises distribution (taking more than 30 seconds) Fitting 106 distributions: 95%|██████████████████████████████████████████████████▌ | 101/106 [01:47<00:03, 1.36it/s]WARNING:root:SKIPPED vonmises_line distribution (taking more than 30 seconds) Fitting 106 distributions: 96%|███████████████████████████████████████████████████ | 102/106 [01:47<00:02, 1.59it/s]WARNING:root:SKIPPED wald distribution (taking more than 30 seconds) Fitting 106 distributions: 97%|███████████████████████████████████████████████████▌ | 103/106 [01:48<00:02, 1.46it/s]WARNING:root:SKIPPED weibull_max distribution (taking more than 30 seconds) Fitting 106 distributions: 98%|████████████████████████████████████████████████████ | 104/106 [01:50<00:02, 1.15s/it]WARNING:root:SKIPPED weibull_min distribution (taking more than 30 seconds) Fitting 106 distributions: 99%|████████████████████████████████████████████████████▌| 105/106 [02:01<00:04, 4.01s/it]WARNING:root:SKIPPED wrapcauchy distribution (taking more than 30 seconds) Fitting 106 distributions: 100%|█████████████████████████████████████████████████████| 106/106 [02:02<00:00, 1.16s/it]
| | sumsquare_error | aic | bic | kl_div | ks_statistic | ks_pvalue |
|---|---|---|---|---|---|---|
| laplace_asymmetric | 1.150717e-11 | 3076.291240 | -97166.519969 | inf | 0.014773 | 0.539751 |
| johnsonsu | 1.667621e-11 | 3058.371080 | -96071.469531 | inf | 0.024229 | 0.063082 |
| exponnorm | 1.863510e-11 | 3106.066912 | -95754.035710 | inf | 0.026330 | 0.033800 |
| fisk | 2.598162e-11 | 3075.585263 | -94780.272972 | inf | 0.036141 | 0.000924 |
| alpha | 2.748285e-11 | 3099.690618 | -94615.686022 | inf | 0.034058 | 0.002181 |
# Pick the single best-fitting distribution, ranked by the sum of squared errors.
f.get_best(method='sumsquare_error')
{'laplace_asymmetric': {'kappa': 0.5619392704241011,
'loc': 128499.99999997897,
'scale': 42949.679716288505}}
Dopasowanie rozkładu w tej bibliotece można oceniać według różnych kryteriów, takich jak: `sumsquare_error`, `aic`, `bic`, `kl_div` czy `ks_statistic`.
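Ranking rozkładów zależy od wybranej miary – np. wśród pokazanych poniżej rozkładów najmniejszy `sumsquare_error` ma `laplace_asymmetric`, a najniższe `aic` – `johnsonsu`.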
# Tabulate the ten best-ranked fits together with every reported fit measure.
top_fits = f.summary(Nbest=10)
print(top_fits)
sumsquare_error aic bic kl_div \
laplace_asymmetric 1.150717e-11 3076.291240 -97166.519969 inf
johnsonsu 1.667621e-11 3058.371080 -96071.469531 inf
exponnorm 1.863510e-11 3106.066912 -95754.035710 inf
fisk 2.598162e-11 3075.585263 -94780.272972 inf
alpha 2.748285e-11 3099.690618 -94615.686022 inf
moyal 3.019415e-11 3113.513555 -94347.997227 inf
invgamma 3.131661e-11 3134.506451 -94233.067968 inf
betaprime 3.221017e-11 3138.770036 -94142.654143 inf
f 3.270396e-11 3136.918366 -94098.077421 inf
genlogistic 3.313074e-11 3202.040139 -94068.071334 inf
ks_statistic ks_pvalue
laplace_asymmetric 0.014773 5.397513e-01
johnsonsu 0.024229 6.308189e-02
exponnorm 0.026330 3.379961e-02
fisk 0.036141 9.241930e-04
alpha 0.034058 2.181046e-03
moyal 0.040736 1.161196e-04
invgamma 0.037863 4.371988e-04
betaprime 0.054838 4.241087e-08
f 0.053178 1.215463e-07
genlogistic 0.048284 2.244731e-06
# Notebook styling: center h1/h2 headings and color them #008080, and give
# div.inner_cell elements an #F0F8FF background.
notebook_css = """
<style>
h1 {
text-align:center;
color:#008080;
}
h2 {
text-align:center;
color:#008080;
}
div.inner_cell {
background-color: #F0F8FF;
}
</style>
"""
HTML(notebook_css)